Download the maize B73 V4 data and sorghum genome file
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-34/gff3/zea_mays/Zea_mays.AGPv4.34.gff3.gz
gunzip Zea_mays.AGPv4.34.gff3.gz
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-34/fasta/zea_mays/dna/Zea_mays.AGPv4.dna.toplevel.fa.gz
gunzip Zea_mays.AGPv4.dna.toplevel.fa.gz
wget ftp://ftp.ensemblgenomes.org/pub/plants/release-49/fasta/sorghum_bicolor/dna/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.gz
gunzip Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.gz
Using AnchorWave to extract full-length CDS.
NOTE: please do NOT use CDS extracted using other software and do NOT use the output full-length CDS file for other purpose. Since AnchorWave filtered some CDS records to minimum the impact of minimap2 limitation on genome alignment that “Minimap2 often misses small exons” (https://github.com/lh3/minimap2#limitations)
anchorwave gff2seq -r Zea_mays.AGPv4.dna.toplevel.fa -i Zea_mays.AGPv4.34.gff3 -o cds.fa
use minimap2 (https://github.com/lh3/minimap2) to map the extracted sequence to the reference genome sequence and synthesis genomes
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa cds.fa > cds.sam
minimap2 -x splice -t 11 -k 12 -a -p 0.4 -N 20 Zea_mays.AGPv4.dna.toplevel.fa cds.fa > ref.sam
prepare genome annotation files
all_reproducible_peaks_summits_merged.bed has been published at https://www.nature.com/articles/s41467-020-18832-8
wget https://github.com/mcstitzer/maize_TEs/raw/master/B73.structuralTEv2.disjoined.2018-09-19.gff3.gz
gunzip B73.structuralTEv2.disjoined.2018-09-19.gff3.gz
grep -v "#" B73.structuralTEv2.disjoined.2018-09-19.gff3 | awk '{print $1"\t"$4-1"\t"$5}' | bedtools sort | bedtools merge > B73.structuralTEv2.disjoined.2018-09-19.bed
grep "biotype=protein_coding" Zea_mays.AGPv4.34.gff3 | grep -e "\sgene\s" | awk '{print $1"\t"$4-1"\t"$5}' | bedtools merge > Zea_mays.AGPv4.34_coding_gene.bed
#total length of geneitc region
cat Zea_mays.AGPv4.34_coding_gene.bed | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' # 162494287
grep "\sCDS\s" Zea_mays.AGPv4.34.gff3 | awk '{print $1"\t"$4-1"\t"$5}' | bedtools sort | bedtools merge > Zea_mays.AGPv4.34_CDS.bed
bedtools subtract -a Zea_mays.AGPv4.34_coding_gene.bed -b Zea_mays.AGPv4.34_CDS.bed | bedtools sort | bedtools merge > Zea_mays.AGPv4.34_coding_gene_nonCDS.bed
perform genome alignment using minimap2 and summarize the result
/usr/bin/time minimap2 -x asm5 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_5.sam 2> minimap2_asm5.log
/usr/bin/time minimap2 -x asm10 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_10.sam 2> minimap2_asm10.log
/usr/bin/time minimap2 -x asm20 -t 1 -a Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > minimap2_sorghum_20.sam 2> minimap2_asm20.log
samtools sort minimap2_sorghum_5.sam > minimap2_sorghum_5.bam
samtools sort minimap2_sorghum_10.sam > minimap2_sorghum_10.bam
samtools sort minimap2_sorghum_20.sam > minimap2_sorghum_20.bam
samtools depth minimap2_sorghum_5.bam | wc -l #2503314
samtools depth minimap2_sorghum_5.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 457652
samtools depth minimap2_sorghum_5.bam | awk '$3>0{print $0}' | wc -l #2489881
samtools depth minimap2_sorghum_5.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l #456029
samtools depth minimap2_sorghum_5.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 2332093
samtools depth minimap2_sorghum_5.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 2320162
samtools depth minimap2_sorghum_5.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 353993
samtools depth minimap2_sorghum_5.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 348042
samtools depth minimap2_sorghum_10.bam | wc -l # 18445054
samtools depth minimap2_sorghum_10.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 3401133
samtools depth minimap2_sorghum_10.bam | awk '$3>0{print $0}' | wc -l # 17913090
samtools depth minimap2_sorghum_10.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l #3347285
samtools depth minimap2_sorghum_10.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 17619682
samtools depth minimap2_sorghum_10.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 17118348
samtools depth minimap2_sorghum_10.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 2354836
samtools depth minimap2_sorghum_10.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 2259444
samtools depth minimap2_sorghum_20.bam | wc -l # 52224881
samtools depth minimap2_sorghum_20.bam -b all_reproducible_peaks_summits_merged.bed | wc -l #11113820
samtools depth minimap2_sorghum_20.bam | awk '$3>0{print $0}' | wc -l # 49024035
samtools depth minimap2_sorghum_20.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 10680401
samtools depth minimap2_sorghum_20.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 47118588
samtools depth minimap2_sorghum_20.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 44182530
samtools depth minimap2_sorghum_20.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 7728976
samtools depth minimap2_sorghum_20.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 7132545
perform genome alignment using LAST(http://last.cbrc.jp/) and summarize the result
/usr/bin/time sh ./lastalPipeline.sh >last.log 2>&1
python2 maf-convert sam sorghum_lastal.maf > sorghum_lastal.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal.sam | samtools sort - > sorghum_lastal.bam; samtools index sorghum_lastal.bam # this is the many-to-many alignment
samtools depth sorghum_lastal.bam | wc -l # 597884081
samtools depth sorghum_lastal.bam | awk '$3>0{print $0}' | wc -l # 594220046
samtools depth sorghum_lastal.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 33554729
samtools depth sorghum_lastal.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 32598984
samtools depth sorghum_lastal.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 118438260
samtools depth sorghum_lastal.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 116732768
samtools depth sorghum_lastal.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 365399859
samtools depth sorghum_lastal.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 364395540
python2 maf-convert sam sorghum_lastal_final.maf > sorghum_lastal_final.sam # this many to one
sed -i 's/query.//g' sorghum_lastal_final.sam
sed -i 's/col.//g' sorghum_lastal_final.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal_final.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal_final.sam | samtools sort - > sorghum_lastal_final.bam; samtools index sorghum_lastal_final.bam
samtools depth sorghum_lastal_final.bam | wc -l # 547513366
samtools depth sorghum_lastal_final.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 32593361
samtools depth sorghum_lastal_final.bam | awk '$3>0{print $0}' | wc -l # 541455727
samtools depth sorghum_lastal_final.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 31471791
samtools depth sorghum_lastal_final.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 113229899
samtools depth sorghum_lastal_final.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 110793737
samtools depth sorghum_lastal_final.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 337208689
samtools depth sorghum_lastal_final.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 335012548
python2 maf-convert sam sorghum_lastal_final_split_swap_split_Comparator.maf > sorghum_lastal_final_split_swap_split_Comparator.sam # one-to-one
sed -i 's/query.//g' sorghum_lastal_final_split_swap_split_Comparator.sam
sed -i 's/col.//g' sorghum_lastal_final_split_swap_split_Comparator.sam
sed -i 's/[0-9]\+H//g' sorghum_lastal_final_split_swap_split_Comparator.sam
samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa sorghum_lastal_final_split_swap_split_Comparator.sam | samtools sort - > sorghum_lastal_final_split_swap_split_Comparator.bam; samtools index sorghum_lastal_final_split_swap_split_Comparator.bam
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam | wc -l # 129898533
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 23170062
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam | awk '$3>0{print $0}' | wc -l # 127859061
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 22483132
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 62730883
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 61595170
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 46540254
samtools depth sorghum_lastal_final_split_swap_split_Comparator.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 46169395
perform genome alignment using AnchorWave and summarize the result
/usr/bin/time ./code/anchorwave proali -i Zea_mays.AGPv4.34.gff3 -as cds.fa -r Zea_mays.AGPv4.dna.toplevel.fa -a cds.sam -ar ref.sam -s Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa -n anchorwave.anchors -R 1 -Q 2 -o anchorwave.maf -f anchorwave.f.maf -w 38000 -fa3 200000 -B -4 -O1 -4 -E1 -2 -O2 -80 -E2 -1 -t 1 >anchorwave.log 2>&1
maf-convert sam anchorwave.maf | sed 's/Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa.//g' | sed 's/Zea_mays.AGPv4.dna.toplevel.fa.//g' | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > anchorwave.bam
samtools mpileup anchorwave.bam | wc -l # 1870770572
samtools depth anchorwave.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 57164808
samtools depth anchorwave.bam | awk '$3>0{print $0}' | wc -l # 125848664
samtools depth anchorwave.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 34133000
samtools depth anchorwave.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 150034551
samtools depth anchorwave.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l #68301524
samtools depth anchorwave.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 1308576282
samtools depth anchorwave.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 27551272
perform genome alignment using MUMmer4 vRC1 (https://mummer4.github.io/) and summarize the result
#/usr/bin/time /home/bs674/software/bin/nucmer -t 60 --sam-short=mumer.maize.short.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.log 2>&1
#/usr/bin/time /home/bs674/software/bin/nucmer -t 69 --sam-long=mumer.maize.long.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.long.log 2>&1
/usr/bin/time /home/bs674/software/bin/nucmer -t 1 --sam-short=mumer.maize.short.sam Zea_mays.AGPv4.dna.toplevel.fa Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa > nucmer.log 2>&1
cat mumer.maize.short.sam | grep -v "@" | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > mumer.maize.short.bam
samtools index mumer.maize.short.bam
samtools depth mumer.maize.short.bam | wc -l # 70953295
samtools depth mumer.maize.short.bam | awk '$3>0{print $0}' | wc -l #69514918
samtools depth mumer.maize.short.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 16073863
samtools depth mumer.maize.short.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 15687267
samtools depth mumer.maize.short.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 61402015
samtools depth mumer.maize.short.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l # 60194318
samtools depth mumer.maize.short.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 10169800
samtools depth mumer.maize.short.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 9987182
perform genome alignment using GSAlign(https://github.com/hsinnan75/GSAlign) and summarize the result
/usr/bin/time /home/bs674/software/GSAlign-1.0.22/bin/GSAlign -r Zea_mays.AGPv4.dna.toplevel.fa -q Sorghum_bicolor.Sorghum_bicolor_NCBIv3.dna.toplevel.fa -t 1 -o sorghum_gsalign -fmt 1 > GSAlign.log 2>&1
maf-convert sam sorghum_gsalign.maf | sed 's/qry.//g' | sed 's/ref.//g' | samtools view -O BAM --reference Zea_mays.AGPv4.dna.toplevel.fa - | samtools sort - > sorghum_gsalign.bam
samtools index sorghum_gsalign.bam
samtools depth sorghum_gsalign.bam -b all_reproducible_peaks_summits_merged.bed | wc -l # 5341494
samtools depth sorghum_gsalign.bam | wc -l # 23870692
samtools depth sorghum_gsalign.bam | awk '$3>0{print $0}' | wc -l # 23114133
samtools depth sorghum_gsalign.bam -b all_reproducible_peaks_summits_merged.bed | awk '$3>0{print $0}' | wc -l # 5138842
samtools depth sorghum_gsalign.bam -b Zea_mays.AGPv4.34_coding_gene.bed | wc -l # 21659744
samtools depth sorghum_gsalign.bam -b Zea_mays.AGPv4.34_coding_gene.bed | awk '$3>0{print $0}' | wc -l #21015569
samtools depth sorghum_gsalign.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | wc -l # 3322894
samtools depth sorghum_gsalign.bam -b B73.structuralTEv2.disjoined.2018-09-19.bed | awk '$3>0{print $0}' | wc -l # 3214544
dat = read.table("all_reproducible_peaks_summits_merged.bed")
dat$len = dat$V3-dat$V2
sum(abs(dat$len))
## [1] 60036753
#60036753
TE2018 = read.table("B73.structuralTEv2.disjoined.2018-09-19.bed")
TE2018$len = TE2018$V3-TE2018$V2
TE2018TotalLength = sum(abs(TE2018$len)) # 1495364259
library(ggplot2)
data = read.table("summaryData", header=TRUE, sep="\t")
data$recall = data$depth_covered_bed / data$bed
data$non_TFBS_matches = data$depth_covered - data$depth_covered_bed
data$non_TFBS = data$depth - data$depth_bed
data$enrichment = (data$depth_covered_bed/data$depth_bed)/(data$non_TFBS_matches/data$non_TFBS)
data$approach <- factor(data$approach, levels = c("AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20", "LAST many-to-many", "LAST many-to-one", "LAST one-to-one", "MUMmer4", "GSAlign"))
data$software <- factor(data$software, levels = c("AnchorWave", "minimap2", "LAST", "MUMmer4", "GSAlign"))
shape = c(16, 17, 17, 17, 15, 15, 15, 3, 7)
myColors <- c("#F8766D", "#D89000", "#A3A500", "#39B600", "#00BFC4", "#00B0F6", "#9590FF", "#E76BF3", "#FF62BC")
names(myColors) <- levels(data$approach)
colScale <- scale_colour_manual(name = "grp",values = myColors)
fillScale <- scale_fill_manual(name = "grp",values = myColors)
print(data)
## software approach depth depth_bed depth_covered
## 1 minimap2 minimap2_asm5 2503314 457652 2489881
## 2 minimap2 minimap2_asm10 18445054 3401133 17913090
## 3 minimap2 minimap2_asm20 52224881 11113820 49024035
## 4 MUMmer4 MUMmer4 70953295 16073863 69514918
## 5 GSAlign GSAlign 23870692 5341494 23114133
## 6 LAST LAST many-to-many 597884081 33554729 594220046
## 7 LAST LAST many-to-one 547513366 32593361 541455727
## 8 LAST LAST one-to-one 129898533 23170062 127859061
## 9 AnchorWave AnchorWave 1870770572 57164808 125848664
## depth_covered_bed bed depth_genetic depth_genetic_covered X2018.09.19
## 1 456029 60036753 2332093 2320162 353993
## 2 3347285 60036753 17619682 17118348 2354836
## 3 10680401 60036753 47118588 44182530 7728976
## 4 15687267 60036753 61402015 60194318 10169800
## 5 5138842 60036753 21659744 21015569 3322894
## 6 32598984 60036753 118438260 116732768 365399859
## 7 31471791 60036753 113229899 110793737 337208689
## 8 22483132 60036753 62730883 61595170 46540254
## 9 34133000 60036753 150034551 68301524 1308576282
## X2018.09.19_covered recall non_TFBS_matches non_TFBS enrichment
## 1 348042 0.007595831 2033852 2045662 1.0022398
## 2 2259444 0.055753931 14565805 15043921 1.0164725
## 3 7132545 0.177897712 38343634 41111061 1.0303615
## 4 9987182 0.261294394 53827651 54879432 0.9950186
## 5 3214544 0.085594935 17975291 18529198 0.9917066
## 6 364395540 0.542983795 561621062 564329352 0.9762018
## 7 335012548 0.524208746 509983936 514920005 0.9749348
## 8 46169395 0.374489473 105375929 106728471 0.9828076
## 9 27551272 0.568535077 91715664 1813605764 11.8071501
p = ggplot(data=data, aes(x=recall, y=enrichment, color=approach, shape=software)) + geom_point(size=5, alpha=0.8) +
guides(colour = guide_legend(override.aes = list(shape = shape))) +scale_shape(guide = FALSE)+ colScale+fillScale+
labs(x="TFBS recall", y="Match ratio in TFBS\nalignment to match ratio\n in non-TFBS alignment", title="")+ xlim(0, 1) + ylim(0, 12) +
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "TFBS_quality_control"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
Define the “recall” as (# position-match base pairs in TFBS and coding genetic region)/(total TFBS length + coding genetic region) By “coding genetic region”, I mean from TSS to TTS of coding genes (inlcuding introns, UTR, CDS) Define “precision” as (# position-match base pairs in TFBS and coding genetic region)/(# position-match base pairs in TFBS and coding genetic region + # position-match base pairs in TE region)
This is not a very solid analysis. But did not find much more convincing way to do that.
data$TFBS_coding_genes_recall = (data$depth_covered_bed+data$depth_genetic_covered) / (data$bed+161046032)
data$TFBS_coding_genes_precision = (data$depth_covered_bed+data$depth_genetic_covered)/(data$depth_covered_bed+data$depth_genetic_covered + data$X2018.09.19_covered)
data$TFBS_coding_genes_fscore = 2*(data$TFBS_coding_genes_precision*data$TFBS_coding_genes_recall)/(data$TFBS_coding_genes_precision+data$TFBS_coding_genes_recall)
p = ggplot(data=data, aes(x=approach, y=TFBS_coding_genes_fscore)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="F-score", title="TFBS+coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=TFBS_coding_genes_recall, y=TFBS_coding_genes_precision, color=software, shape=software)) + geom_point(alpha=0.8) +
labs(x="recall", y="precision", title="TFBS+coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data$coding_genes_recall = (data$depth_genetic_covered) / (161046032)
data$coding_genes_precision = (data$depth_genetic_covered)/(data$depth_genetic_covered + data$X2018.09.19_covered)
data$coding_genes_fscore = 2*(data$coding_genes_precision*data$coding_genes_recall)/(data$coding_genes_precision+data$coding_genes_recall)
p = ggplot(data=data, aes(x=approach, y=coding_genes_fscore)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="F-score", title="coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=coding_genes_recall, y=coding_genes_precision, color=software, shape=software)) + geom_point(alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="recall", y="precision", title="coding_genes") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data$TFBS_recall = (data$depth_covered_bed) / (data$bed)
data$TFBS_precision = (data$depth_covered_bed)/(data$depth_covered_bed + data$X2018.09.19_covered)
data$TFBS_fscore = 2*(data$TFBS_precision*data$TFBS_recall)/(data$TFBS_precision+data$TFBS_recall)
p = ggplot(data=data, aes(x=approach, y=TFBS_fscore)) + geom_bar(stat="identity", alpha=0.8)+
labs(x="", y="F-score", title="TFBS") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=TFBS_recall, y=TFBS_precision, color=software, shape=software)) + geom_point(alpha=0.8) +
labs(x="recall", y="precision", title="TFBS") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
p = ggplot(data=data, aes(x=approach, y=recall, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill=FALSE)+
labs(x="", y="Proportion of maize\nTFBS matched to\nsorghum genome", title="") + colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "TFBS_recall"
png(paste(file, ".png", sep=""), width=720, height=320)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=4)
print(p)
dev.off()
## png
## 2
totalTELength = read.table("B73.structuralTEv2.disjoined.2018-09-19.bed")
totalTELength = sum(abs(totalTELength$V3-totalTELength$V2))
totalGenomeLength = read.table("Zea_mays.AGPv4.dna.toplevel.fa.fai")
totalGenomeLength = sum(totalGenomeLength$V2)
data$coverage = data$depth/totalGenomeLength
dat = rbind( data.frame(approach=data$approach, proportion=data$depth_covered/totalGenomeLength, category = "position match"), data.frame(approach=data$approach, proportion=(data$depth-data$depth_covered)/totalGenomeLength, category = "gap"), data.frame(approach=data$approach, proportion=(totalGenomeLength-data$depth)/totalGenomeLength, category = "unaligned") )
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=category)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#54AEE1", "#92A000", "#EF8600")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
labs(x="", y="Proportion of maize \n genome aligned to sorghum", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "maize_sorghum_genome_aligned"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
dat = rbind( data.frame(approach=data$approach, proportion=data$depth_covered/totalGenomeLength, category = "position match"), data.frame(approach=data$approach, proportion=(data$depth-data$depth_covered)/totalGenomeLength, category = "gap"), data.frame(approach=data$approach, proportion=(totalGenomeLength-data$depth)/totalGenomeLength, category = "unaligned") )
dat = rbind(
data.frame(approach=data$approach, proportion=(data$depth_covered-data$X2018.09.19_covered)/totalGenomeLength, c="6non-TE position match", category = "position match", region="Non-TE_PAV"),
data.frame(approach=data$approach, proportion=data$X2018.09.19_covered/totalGenomeLength, c="5TE position match", category = "position match", region="TE_PAV"),
data.frame(approach=data$approach, proportion=(data$depth-data$depth_covered - data$X2018.09.19 + data$X2018.09.19_covered)/totalGenomeLength, c="3non-TE gap", category = "gap", region="Non-TE_PAV"),
data.frame(approach=data$approach, proportion=(data$X2018.09.19-data$X2018.09.19_covered)/totalGenomeLength, c="4TE gap", category = "gap", region="TE_PAV"),
data.frame(approach=data$approach, proportion=(totalGenomeLength-data$depth-totalTELength+data$X2018.09.19)/totalGenomeLength, c="2non-TE unaligned", category = "unaligned", region="Non-TE_PAV"),
data.frame(approach=data$approach, proportion=(totalTELength-data$X2018.09.19)/totalGenomeLength, c="1TE unaligned", category = "unaligned", region="TE_PAV") )
dat$category = factor(dat$category, levels=c("unaligned", "gap", "position match"))
dat$c = factor(dat$c, levels=c("1TE unaligned", "2non-TE unaligned", "3non-TE gap", "4TE gap", "5TE position match", "6non-TE position match"))
dat$region = factor(dat$region)
print(dat)
## approach proportion c category
## 1 minimap2_asm5 1.003164e-03 6non-TE position match position match
## 2 minimap2_asm10 7.331633e-03 6non-TE position match position match
## 3 minimap2_asm20 1.962054e-02 6non-TE position match position match
## 4 MUMmer4 2.788076e-02 6non-TE position match position match
## 5 GSAlign 9.320288e-03 6non-TE position match position match
## 6 LAST many-to-many 1.076420e-01 6non-TE position match position match
## 7 LAST many-to-one 9.669094e-02 6non-TE position match position match
## 8 LAST one-to-one 3.826065e-02 6non-TE position match position match
## 9 AnchorWave 4.603914e-02 6non-TE position match position match
## 10 minimap2_asm5 1.630110e-04 5TE position match position match
## 11 minimap2_asm10 1.058246e-03 5TE position match position match
## 12 minimap2_asm20 3.340641e-03 5TE position match position match
## 13 MUMmer4 4.677655e-03 5TE position match position match
## 14 GSAlign 1.505583e-03 5TE position match position match
## 15 LAST many-to-many 1.706704e-01 5TE position match position match
## 16 LAST many-to-one 1.569084e-01 5TE position match position match
## 17 LAST one-to-one 2.162417e-02 5TE position match position match
## 18 AnchorWave 1.290408e-02 5TE position match position match
## 19 minimap2_asm5 3.504313e-06 3non-TE gap gap
## 20 minimap2_asm10 2.044754e-04 3non-TE gap gap
## 21 minimap2_asm20 1.219819e-03 3non-TE gap gap
## 22 MUMmer4 5.881546e-04 3non-TE gap gap
## 23 GSAlign 3.035990e-04 3non-TE gap gap
## 24 LAST many-to-many 1.245720e-03 3non-TE gap gap
## 25 LAST many-to-one 1.808594e-03 3non-TE gap gap
## 26 LAST one-to-one 7.815214e-04 3non-TE gap gap
## 27 AnchorWave 2.172735e-01 3non-TE gap gap
## 28 minimap2_asm5 2.787245e-06 4TE gap gap
## 29 minimap2_asm10 4.467836e-05 4TE gap gap
## 30 minimap2_asm20 2.793479e-04 4TE gap gap
## 31 MUMmer4 8.553204e-05 4TE gap gap
## 32 GSAlign 5.074744e-05 4TE gap gap
## 33 LAST many-to-many 4.703887e-04 4TE gap gap
## 34 LAST many-to-one 1.028597e-03 4TE gap gap
## 35 LAST one-to-one 1.736977e-04 4TE gap gap
## 36 AnchorWave 5.999884e-01 4TE gap gap
## 37 minimap2_asm5 2.986158e-01 2non-TE unaligned unaligned
## 38 minimap2_asm10 2.920863e-01 2non-TE unaligned unaligned
## 39 minimap2_asm20 2.787821e-01 2non-TE unaligned unaligned
## 40 MUMmer4 2.711535e-01 2non-TE unaligned unaligned
## 41 GSAlign 2.899986e-01 2non-TE unaligned unaligned
## 42 LAST many-to-many 1.907348e-01 2non-TE unaligned unaligned
## 43 LAST many-to-one 2.011229e-01 2non-TE unaligned unaligned
## 44 LAST one-to-one 2.605803e-01 2non-TE unaligned unaligned
## 45 AnchorWave 3.630983e-02 2non-TE unaligned unaligned
## 46 minimap2_asm5 7.002118e-01 1TE unaligned unaligned
## 47 minimap2_asm10 6.992746e-01 1TE unaligned unaligned
## 48 minimap2_asm20 6.967576e-01 1TE unaligned unaligned
## 49 MUMmer4 6.956144e-01 1TE unaligned unaligned
## 50 GSAlign 6.988212e-01 1TE unaligned unaligned
## 51 LAST many-to-many 5.292367e-01 1TE unaligned unaligned
## 52 LAST many-to-one 5.424405e-01 1TE unaligned unaligned
## 53 LAST one-to-one 6.785797e-01 1TE unaligned unaligned
## 54 AnchorWave 8.748511e-02 1TE unaligned unaligned
## region
## 1 Non-TE_PAV
## 2 Non-TE_PAV
## 3 Non-TE_PAV
## 4 Non-TE_PAV
## 5 Non-TE_PAV
## 6 Non-TE_PAV
## 7 Non-TE_PAV
## 8 Non-TE_PAV
## 9 Non-TE_PAV
## 10 TE_PAV
## 11 TE_PAV
## 12 TE_PAV
## 13 TE_PAV
## 14 TE_PAV
## 15 TE_PAV
## 16 TE_PAV
## 17 TE_PAV
## 18 TE_PAV
## 19 Non-TE_PAV
## 20 Non-TE_PAV
## 21 Non-TE_PAV
## 22 Non-TE_PAV
## 23 Non-TE_PAV
## 24 Non-TE_PAV
## 25 Non-TE_PAV
## 26 Non-TE_PAV
## 27 Non-TE_PAV
## 28 TE_PAV
## 29 TE_PAV
## 30 TE_PAV
## 31 TE_PAV
## 32 TE_PAV
## 33 TE_PAV
## 34 TE_PAV
## 35 TE_PAV
## 36 TE_PAV
## 37 Non-TE_PAV
## 38 Non-TE_PAV
## 39 Non-TE_PAV
## 40 Non-TE_PAV
## 41 Non-TE_PAV
## 42 Non-TE_PAV
## 43 Non-TE_PAV
## 44 Non-TE_PAV
## 45 Non-TE_PAV
## 46 TE_PAV
## 47 TE_PAV
## 48 TE_PAV
## 49 TE_PAV
## 50 TE_PAV
## 51 TE_PAV
## 52 TE_PAV
## 53 TE_PAV
## 54 TE_PAV
library(ggpattern)
##
## Attaching package: 'ggpattern'
## The following objects are masked from 'package:ggplot2':
##
## flip_data, flipped_names, gg_dep, has_flipped_aes, remove_missing,
## should_stop, waiver
p = ggplot(data=dat, aes(x=approach, y=proportion, fill=c, pattern = region)) + geom_bar(stat="identity") + scale_fill_manual(values=c("#54AEE1","#54AEE1", "#92A000", "#92A000", "#EF8600", "#EF8600")) + guides(fill=guide_legend(nrow=1,byrow=TRUE))+
labs(x="", y="Proportion of maize \n genome aligned to sorghum", title="")+
geom_bar_pattern( stat="identity", color = "black", pattern_fill = "black",
pattern_angle = 45,
pattern_density = 0.1,
pattern_spacing = 0.025,
pattern_key_scale_factor = 0.6) +
scale_pattern_manual(values = c(TE_PAV = "stripe", TE_PAV = "none")) +
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
legend.position = "top",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "maize_sorghum_genome_aligned_v2"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=c("ideal expection", as.character(data$software)), approach=c("ideal expection", as.character(data$approach)), coverage=c(0, (data$X2018.09.19_covered)/data$depth_covered) )
data0$approach <- factor(data0$approach, levels = c("ideal expection", "AnchorWave", "minimap2_asm5", "minimap2_asm10", "minimap2_asm20","LAST many-to-many", "LAST many-to-one", "LAST one-to-one", "MUMmer4", "GSAlign"))
data0$software <- factor(data0$software, levels = c("ideal expection", "AnchorWave", "minimap2", "LAST", "MUMmer4", "GSAlign"))
myColors <- c("#00FF00", "#F8766D", "#D89000", "#A3A500", "#39B600", "#00BFC4", "#00B0F6", "#9590FF", "#E76BF3", "#FF62BC")
names(myColors) <- levels(data$approach)
colScale <- scale_colour_manual(name = "grp",values = myColors)
fillScale <- scale_fill_manual(name = "grp",values = myColors)
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+
labs(x="", y="Proportion of maize-sorghum\ngenome alignmenet matchs\nin maize TE region", title="")+ylim(0, 1)+colScale+fillScale+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = c("#00FF00", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000", "#000000")))
print(p)
file = "2018.09.19_te_depth"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data$X2018.09.19_gaps = data$X2018.09.19 - data$X2018.09.19_covered
data$total_gaps = data$depth - data$depth_covered
data$non_TE_gaps = data$total_gaps - data$X2018.09.19_gaps
data$non_TE_matches = data$depth_covered - data$X2018.09.19_covered
data$non_TE = data$depth - data$X2018.09.19
data0 = data.frame(software=data$software, approach=data$approach, coverage=((data$X2018.09.19_gaps/data$X2018.09.19) / (data$non_TE_gaps/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 5)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in TE\nalignment to gap ratio\nin non-TE alignment", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_gap_ratio"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=data$software, approach=data$approach, coverage=( (data$non_TE_gaps/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in non-TE", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data0 = data.frame(software=data$software, approach=data$approach, coverage=( (data$X2018.09.19_gaps/data$X2018.09.19 ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Gap ratio in TE", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
data0 = data.frame(software=data$software, approach=data$approach, coverage=((data$X2018.09.19_covered/data$X2018.09.19) / (data$non_TE_matches/data$non_TE ) )) # the value of 162494287 is the total genetic region length, which is counted above in this file
p = ggplot(data=data0, aes(x=approach, y=coverage, color=approach, fill=approach)) + geom_bar(stat="identity", alpha=0.8) +guides(color = FALSE, fill = FALSE)+ylim(0, 1.1)+geom_hline(yintercept=1, linetype="dashed", color = "gray", size=1) +
labs(x="", y="Match ratio in TE\nalignment to match ratio\n in non-TE alignment", title="")+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_blank(),
axis.ticks.x = element_blank())
# axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_match_ratio"
png(paste(file, ".png", sep=""), width=720, height=320)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=4)
print(p)
dev.off()
## png
## 2
data0 = data.frame(software=data$software, approach=data$approach, ratio=((data$X2018.09.19-data$X2018.09.19_covered)/data$X2018.09.19), coverage=(data$X2018.09.19 - data$X2018.09.19_covered)/TE2018TotalLength )
p = ggplot(data=data0, aes(x=coverage, y=ratio, color=approach, shape=software)) + geom_point(size=5, alpha=0.8) +
guides(colour = guide_legend(override.aes = list(shape = shape))) +scale_shape(guide = FALSE)+
labs(x="Gap ratio in maize TEs(recall) of\nmaize to sorghum genome alignment", y="Gap ratio in maize TE alignments\n(precision)", title="")+xlim(0,1)+ylim(0,1)+
theme_bw() +theme_grey(base_size = 24) + theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.title = element_blank(),
axis.line = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text( colour = "black"),
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black"))
print(p)
file = "2018.09.19_te_depth_2"
png(paste(file, ".png", sep=""), width=720, height=560)
print(p)
dev.off()
## png
## 2
pdf(paste(file, ".pdf", sep=""), width=9, height=7)
print(p)
dev.off()
## png
## 2
# here I am using the Cairo library to compile the output plot. The output file looks better than native library, but it a little bit of mass up the Rmarkdown output file.
library(ggplot2)
library(compiler)
enableJIT(3)
## [1] 3
library(ggplot2)
library("Cairo")
changetoM <- function ( position ){
position=position/1000000;
paste(position, "M", sep="")
}
data =read.table("anchorwave.anchors", head=TRUE)
data$refChr = paste("chr", data$refChr, sep="")
data$queryChr = paste("chr", data$queryChr, sep="")
data = data[which(data$refChr %in% c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" )),]
data = data[which(data$queryChr %in% c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" )),]
data$refChr = factor(data$refChr, levels=c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" ))
data$queryChr = factor(data$queryChr, levels=c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10" ))
data$strand = factor(data$strand)
data = data[which(data$gene != "intergenetic"),]
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=2, aes(color=factor(strand)))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 60) +
labs(x="sorghum", y="maize")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="anchors.png",width = 4800, height = 3200)
p
dev.off()
## png
## 2
CairoPDF(file="anchors.pdf",width = 60, height = 40)
p
dev.off()
## png
## 2
data =read.table("anchorwave.anchors", head=TRUE)
data$refChr = paste("chr", data$refChr, sep="")
data$queryChr = paste("chr", data$queryChr, sep="")
data = data[which(data$refChr %in% c("chr4", "chr5")),]
data = data[which(data$queryChr %in% c("chr4", "chr5" )),]
data$refChr = factor(data$refChr, levels=c( "chr4", "chr5" ))
data$queryChr = factor(data$queryChr, levels=c( "chr4", "chr5" ))
data$strand = factor(data$strand)
data = data[which(data$gene != "intergenetic"),]
p = ggplot(data=data, aes(x=queryStart, y=referenceStart))+geom_point(size=2, aes(color=factor(strand)))+facet_grid(refChr~queryChr, scales="free", space="free" )+ theme_grey(base_size = 24) +
labs(x="sorghum", y="maize")+scale_x_continuous(labels=changetoM) + scale_y_continuous(labels=changetoM) +
theme(axis.line = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(fill=NA,color="black", size=0.5, linetype="solid"),
axis.text.y = element_text( colour = "black"),
legend.position='none',
axis.text.x = element_text(angle=300, hjust=0, vjust=1, colour = "black") )
CairoPNG(file="anchor2.png",width = 720, height = 560)
p
dev.off()
## png
## 2
CairoPDF(file="anchor2.pdf",width = 9, height = 7)
p
dev.off()
## png
## 2
# this plot suggested there is no Interchromosomal translocations happened